Predict the popularity of board games on BoardGameGeek.com
library(tidyverse)
library(tidymodels)
library(scales)
library(skimr)
library(kableExtra)
# colours
#pal_jo <- c(viridisLite::magma(8)[2:8], "#4C4C53", "#9B9BA8")
pal_sliced <- c(
'#e946a5', # pink
'#68e199', # green
'#302df0', # blue
'#6a45b0', # purple
'#129875', # darker green
'#4f94bf', # lighter blue
'#b545ab', # darker pink
'#000000', # black,
'#cccccc' #grey
)
title_colour <- pal_sliced[1]
table_colour <- pal_sliced[4]
bar_colour <- pal_sliced[2]
grid_colour <- pal_sliced[9]
theme_set(theme_bw() %+replace%
theme(
# align title and caption to the plot not the panel
plot.title.position = 'plot',
plot.caption.position = 'plot',
# change the title and caption to markdown and move them futher from the plot
plot.title = element_text(
size = rel(1.3),
hjust = 0,
margin = margin(c(0, 0, 10, 0)),
colour = title_colour
),
plot.subtitle = element_text(
size = rel(1.15),
hjust = 0,
margin = margin(c(0, 0, 15, 0))
),
plot.caption = element_text(
hjust = 1,
margin = margin(c(10, 0, 0, 0))
),
# move axis titles to the left/top and change them to markdown
axis.title = element_text(hjust = 1),
# allow the axis values to the markdown as well
axis.text = element_text(),
# remove the panel border
panel.border = element_blank(),
# put in the axis lines with a slightly thicker line than the gridlines
axis.line = element_line(colour = grid_colour, size = rel(1.5)),
# make the tickmarks the same colour
axis.ticks = element_line(colour = grid_colour),
# facet strip text left aligned with extra space above
strip.text = element_text(
hjust = 0, margin = margin(c(10, 0, 0, 0)), colour = title_colour
),
# clear colour and fill for strip
strip.background = element_rect(colour = NA, fill = NA),
# dotted gridlines
panel.grid = element_line(linetype = 'dotted'),
# ability to use a different colour for the gridlines
panel.grid.major.x = element_line(colour = grid_colour),
panel.grid.major.y = element_line(colour = grid_colour),
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
)
)
scale_y_pct <- function(
accuracy = 1L,
breaks = pretty_breaks(),
expand = expansion(mult = c(0, .05)),
...
) {
scale_y_continuous(
labels = scales::percent_format(accuracy = accuracy, big.mark = ","),
breaks = breaks,
expand = expand,
...
)
}
scale_y_comma <- function(
accuracy = 1L,
breaks = pretty_breaks(),
expand = expansion(mult = c(0, .05)),
...
) {
scale_y_continuous(
labels = scales::comma_format(accuracy = accuracy),
breaks = breaks,
expand = expand,
...
)
}
scale_fill_jo <- function(...) {
scale_fill_manual(values = pal_sliced, ...)
}
scale_fill_discrete <- scale_fill_jo
update_geom_defaults("bar", list(fill = bar_colour))
update_geom_defaults("col", list(fill = bar_colour))
update_geom_defaults("point", list(colour = bar_colour))
update_geom_defaults("line", list(colour = bar_colour))
── Column specification ────────────────────────────────────────────────────────────────────────
cols(
.default = col_character(),
game_id = col_double(),
min_players = col_double(),
max_players = col_double(),
avg_time = col_double(),
min_time = col_double(),
max_time = col_double(),
year = col_double(),
geek_rating = col_double(),
num_votes = col_double(),
age = col_double(),
owned = col_double()
)
ℹ Use `spec()` for the full column specifications.
to_score <- read_csv("sliced/s01e01/test.csv", guess_max = 200000) %>%
mutate(
category11 = as.character(category11),
category12 = as.character(category12)
)
── Column specification ────────────────────────────────────────────────────────────────────────
cols(
.default = col_character(),
game_id = col_double(),
min_players = col_double(),
max_players = col_double(),
avg_time = col_double(),
min_time = col_double(),
max_time = col_double(),
year = col_double(),
num_votes = col_double(),
age = col_double(),
owned = col_double(),
category11 = col_logical(),
category12 = col_logical()
)
ℹ Use `spec()` for the full column specifications.
| Name | to_build |
| Number of rows | 3499 |
| Number of columns | 26 |
| _______________________ | |
| Column type frequency: | |
| character | 15 |
| numeric | 11 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| names | 0 | 1.00 | 1 | 82 | 0 | 3485 | 0 |
| mechanic | 0 | 1.00 | 4 | 258 | 0 | 1758 | 0 |
| category1 | 0 | 1.00 | 4 | 26 | 0 | 78 | 0 |
| category2 | 611 | 0.83 | 4 | 26 | 0 | 81 | 0 |
| category3 | 1773 | 0.49 | 4 | 26 | 0 | 72 | 0 |
| category4 | 2636 | 0.25 | 4 | 25 | 0 | 62 | 0 |
| category5 | 3098 | 0.11 | 4 | 25 | 0 | 45 | 0 |
| category6 | 3363 | 0.04 | 4 | 25 | 0 | 38 | 0 |
| category7 | 3453 | 0.01 | 4 | 25 | 0 | 21 | 0 |
| category8 | 3480 | 0.01 | 5 | 19 | 0 | 14 | 0 |
| category9 | 3494 | 0.00 | 7 | 19 | 0 | 5 | 0 |
| category10 | 3495 | 0.00 | 6 | 25 | 0 | 3 | 0 |
| category11 | 3498 | 0.00 | 9 | 9 | 0 | 1 | 0 |
| category12 | 3498 | 0.00 | 15 | 15 | 0 | 1 | 0 |
| designer | 0 | 1.00 | 4 | 157 | 0 | 1905 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| game_id | 0 | 1 | 89632.37 | 77040.84 | 2.00 | 11164.50 | 73538.0 | 160677.0 | 244522.0 | ▇▂▃▃▂ |
| min_players | 0 | 1 | 2.01 | 0.67 | 0.00 | 2.00 | 2.0 | 2.0 | 8.0 | ▂▇▁▁▁ |
| max_players | 0 | 1 | 5.06 | 7.24 | 0.00 | 4.00 | 4.0 | 6.0 | 200.0 | ▇▁▁▁▁ |
| avg_time | 0 | 1 | 117.24 | 487.78 | 0.00 | 30.00 | 60.0 | 120.0 | 22500.0 | ▇▁▁▁▁ |
| min_time | 0 | 1 | 82.53 | 214.19 | 0.00 | 30.00 | 45.0 | 90.0 | 5400.0 | ▇▁▁▁▁ |
| max_time | 0 | 1 | 116.80 | 487.84 | 0.00 | 30.00 | 60.0 | 120.0 | 22500.0 | ▇▁▁▁▁ |
| year | 0 | 1 | 1996.10 | 161.95 | -3000.00 | 2003.00 | 2011.0 | 2015.0 | 2018.0 | ▁▁▁▁▇ |
| geek_rating | 0 | 1 | 6.09 | 0.48 | 5.64 | 5.73 | 5.9 | 6.3 | 8.5 | ▇▂▁▁▁ |
| num_votes | 0 | 1 | 2006.03 | 4644.62 | 62.00 | 281.00 | 618.0 | 1640.0 | 77423.0 | ▇▁▁▁▁ |
| age | 0 | 1 | 10.43 | 3.22 | 0.00 | 8.00 | 11.0 | 12.0 | 42.0 | ▃▇▁▁▁ |
| owned | 0 | 1 | 3054.96 | 6369.31 | 49.00 | 622.50 | 1204.0 | 2723.0 | 111807.0 | ▇▁▁▁▁ |
Min. 1st Qu. Median Mean 3rd Qu. Max.
5.640 5.726 5.905 6.088 6.303 8.502
ggplot(to_build, aes(geek_rating)) +
geom_histogram(binwidth = 0.2) +
scale_y_comma() +
labs(title = 'Geek rating', x = NULL, y = NULL, fill = 'Rating > 7')Score over 7 looks good
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 2.000 2.000 2.015 2.000 8.000
ggplot(to_build, aes(min_players, fill = geek_rating >= 7)) +
geom_histogram(binwidth = 1) +
scale_y_comma() +
labs(title = 'Min players', x = NULL, y = NULL, fill = 'Rating > 7')ggplot(to_build, aes(min_players, fill = geek_rating >= 7)) +
geom_histogram(binwidth = 1, position = 'fill') +
scale_y_pct() +
labs(title = 'Min players', x = NULL, y = NULL, fill = 'Rating > 7') Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 4.000 4.000 5.061 6.000 200.000
ggplot(to_build, aes(max_players, fill = geek_rating >= 7)) +
geom_histogram(binwidth = 1) +
xlim(0, 20) +
scale_y_comma() +
labs(title = 'Max players', x = NULL, y = NULL, fill = 'Rating > 7')ggplot(to_build, aes(max_players, fill = geek_rating >= 7)) +
geom_histogram(binwidth = 1, position = 'fill') +
xlim(0, 20) +
scale_y_pct() +
labs(title = 'Max players', x = NULL, y = NULL, fill = 'Rating > 7') Min. 1st Qu. Median Mean 3rd Qu. Max.
-3000 2003 2011 1996 2015 2018
ggplot(to_build, aes(year, fill = geek_rating >= 7)) +
geom_histogram(binwidth = 10) +
scale_y_comma() +
labs(title = 'Year', x = NULL, y = NULL, fill = 'Rating > 7')to_build %>%
filter(year < 0) %>%
arrange(year) %>%
select(names, year) %>%
kable(
escape = FALSE,
padding = 5
) %>%
kable_styling(
bootstrap_options = c("striped", "hover"),
fixed_thead = TRUE,
position = "left"
) %>%
row_spec(0, color = "white", background = table_colour)| names | year |
|---|---|
| Backgammon | -3000 |
| Go | -2200 |
to_build %>%
filter(year >= 0 & year < 1900) %>%
count(year) %>%
kable(
escape = FALSE,
padding = 5
) %>%
kable_styling(
bootstrap_options = c("striped", "hover"),
fixed_thead = TRUE,
position = "left"
) %>%
row_spec(0, color = "white", background = table_colour)| year | n |
|---|---|
| 0 | 10 |
| 400 | 1 |
| 550 | 1 |
| 700 | 1 |
| 1000 | 1 |
| 1425 | 1 |
| 1600 | 2 |
| 1663 | 1 |
| 1701 | 1 |
| 1715 | 1 |
| 1780 | 1 |
| 1800 | 2 |
| 1810 | 1 |
| 1848 | 1 |
| 1850 | 1 |
| 1870 | 1 |
| 1876 | 1 |
| 1883 | 1 |
| 1895 | 1 |
Min. 1st Qu. Median Mean 3rd Qu. Max.
1904 2004 2011 2007 2015 2018
ggplot(to_build, aes(year, fill = geek_rating >= 7)) +
geom_histogram(binwidth = 10) +
xlim(1900, 2020) +
scale_y_comma() +
labs(title = 'Year', x = NULL, y = NULL, fill = 'Rating > 7')ggplot(to_build, aes(year, fill = geek_rating >= 7)) +
geom_histogram(binwidth = 10, position = 'fill') +
xlim(1900, 2020) +
scale_y_pct() +
labs(title = 'Year', x = NULL, y = NULL, fill = 'Rating > 7') Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0 30.0 60.0 117.2 120.0 22500.0
ggplot(to_build, aes(avg_time, fill = geek_rating >= 7)) +
geom_histogram(binwidth = 15) +
xlim(0, 240) +
scale_y_comma() +
labs(title = 'Average Time', x = NULL, y = NULL, fill = 'Rating > 7')ggplot(to_build, aes(avg_time, fill = geek_rating >= 7)) +
geom_histogram(binwidth = 15, position = 'fill') +
xlim(0, 240) +
scale_y_pct() +
labs(title = 'Average Time', x = NULL, y = NULL, fill = 'Rating > 7') Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 30.00 45.00 82.53 90.00 5400.00
ggplot(to_build, aes(min_time, fill = geek_rating >= 7)) +
geom_histogram(binwidth = 15) +
xlim(0, 240) +
scale_y_comma() +
labs(title = 'Min Time', x = NULL, y = NULL, fill = 'Rating > 7')ggplot(to_build, aes(min_time, fill = geek_rating >= 7)) +
geom_histogram(binwidth = 15, position = 'fill') +
xlim(0, 240) +
scale_y_pct() +
labs(title = 'Min Time', x = NULL, y = NULL, fill = 'Rating > 7') Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0 30.0 60.0 116.8 120.0 22500.0
ggplot(to_build, aes(max_time, fill = geek_rating >= 7)) +
geom_histogram(binwidth = 15) +
xlim(0, 240) +
scale_y_comma() +
labs(title = 'Max Time', x = NULL, y = NULL, fill = 'Rating > 7')ggplot(to_build, aes(max_time, fill = geek_rating >= 7)) +
geom_histogram(binwidth = 15, position = 'fill') +
xlim(0, 240) +
scale_y_pct() +
labs(title = 'Max Time', x = NULL, y = NULL, fill = 'Rating > 7') Min. 1st Qu. Median Mean 3rd Qu. Max.
62 281 618 2006 1640 77423
ggplot(to_build, aes(num_votes, fill = geek_rating >= 7)) +
geom_histogram(bins = 50) +
scale_x_log10(label = comma_format()) +
scale_y_comma() +
labs(title = 'Number of votes', x = NULL, y = NULL, fill = 'Rating > 7')ggplot(to_build, aes(num_votes, fill = geek_rating >= 7)) +
geom_histogram(bins = 50, position = 'fill') +
scale_x_log10(label = comma_format()) +
scale_y_pct() +
labs(title = 'Number of votes', x = NULL, y = NULL, fill = 'Rating > 7') Min. 1st Qu. Median Mean 3rd Qu. Max.
49.0 622.5 1204.0 3055.0 2723.0 111807.0
ggplot(to_build, aes(owned, fill = geek_rating >= 7)) +
geom_histogram(bins = 50) +
scale_x_log10(label = comma_format()) +
scale_y_comma() +
labs(title = 'Owned', x = NULL, y = NULL, fill = 'Rating > 7')ggplot(to_build, aes(owned, fill = geek_rating >= 7)) +
geom_histogram(bins = 50, position = 'fill') +
scale_x_log10(label = comma_format()) +
scale_y_pct() +
labs(title = 'Owned', x = NULL, y = NULL, fill = 'Rating > 7') Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 8.00 11.00 10.43 12.00 42.00
ggplot(to_build, aes(age, fill = geek_rating >= 7)) +
geom_histogram(binwidth = 1) +
xlim(0, 20) +
scale_y_comma() +
labs(title = 'Age', x = NULL, y = NULL, fill = 'Rating > 7')ggplot(to_build, aes(age, fill = geek_rating >= 7)) +
geom_histogram(binwidth = 1, position = 'fill') +
xlim(0, 20) +
scale_y_pct() +
labs(title = 'Age', x = NULL, y = NULL, fill = 'Rating > 7')top_cat <- to_build %>%
select(starts_with('cate')) %>%
gather(key = 'key', value = 'value') %>%
filter(! is.na(value)) %>%
count(value, sort = TRUE) %>%
#slice(1:20) %>%
select(value, n) %>%
mutate(pct = n / nrow(to_build)) %>%
filter(pct > 0.05)
ggplot(top_cat, aes(fct_reorder(value, n), pct)) +
geom_col() +
coord_flip() +
scale_y_pct() +
labs(title = 'Top categories', x = NULL, y = NULL, fill = 'Rating > 7')top_mechanic <- to_build %>%
select(mechanic) %>%
mutate(mechanic = str_split(mechanic, ',')) %>%
unnest(mechanic) %>%
mutate(mechanic = str_trim(mechanic)) %>%
count(mechanic, sort = TRUE) %>%
#slice(1:20) %>%
select(mechanic, n) %>%
mutate(pct = n / nrow(to_build)) %>%
filter(pct > 0.05)
ggplot(top_mechanic, aes(fct_reorder(mechanic, n), pct)) +
geom_col() +
coord_flip() +
scale_y_pct() +
labs(title = 'Top mechanics', x = NULL, y = NULL, fill = 'Rating > 7')top_designer <- to_build %>%
select(designer)%>%
mutate(designer = str_split(designer, ',')) %>%
unnest(designer) %>%
mutate(designer = str_trim(designer)) %>%
count(designer, sort = TRUE) %>%
filter(! designer %in% c('(Uncredited)', 'Jr.')) %>%
slice(1:10) %>%
select(designer, n) %>%
mutate(pct = n / nrow(to_build))
ggplot(top_designer, aes(fct_reorder(designer, n), pct)) +
geom_col() +
coord_flip() +
scale_y_pct(accuracy = 0.1) +
labs(title = 'Top designers', x = NULL, y = NULL, fill = 'Rating > 7')rec <- recipe(geek_rating ~ ., data = train) %>%
# id role for game_id
update_role(game_id, new_role = "ID") %>%
# flags for categories
step_mutate(
cat_card_game = if_any(starts_with("cate"), ~ .x == "Card Game"),
cat_card_game =as.numeric( replace_na(cat_card_game, FALSE)),
cat_wargame = if_any(starts_with("cate"), ~ .x == "Wargame"),
cat_wargame = as.numeric(replace_na(cat_wargame, FALSE)),
cat_fantasy = if_any(starts_with("cate"), ~ .x == "Fantasy"),
cat_fantasy = as.numeric(replace_na(cat_fantasy, FALSE)),
cat_economic = if_any(starts_with("cate"), ~ .x == "Economic"),
cat_economic = as.numeric(replace_na(cat_economic, FALSE)),
cat_fighting = if_any(starts_with("cate"), ~ .x == "Fighting"),
cat_fighting = as.numeric(replace_na(cat_fighting, FALSE)),
cat_science_fiction = if_any(starts_with("cate"), ~ .x ==
"Science Fiction"),
cat_science_fiction = as.numeric(replace_na(cat_science_fiction, FALSE)),
cat_dice = if_any(starts_with("cate"), ~ .x == "Dice"),
cat_dice = as.numeric(replace_na(cat_dice, FALSE)),
cat_bluffing = if_any(starts_with("cate"), ~ .x == "Bluffing"),
cat_bluffing = as.numeric(replace_na(cat_bluffing, FALSE)),
cat_adventure = if_any(starts_with("cate"), ~ .x == "Adventure"),
cat_adventure = as.numeric(replace_na(cat_adventure, FALSE)),
cat_miniatures = if_any(starts_with("cate"), ~ .x == "Miniatures"),
cat_miniatures = as.numeric(replace_na(cat_miniatures, FALSE)),
cat_ww2 = if_any(starts_with("cate"), ~ .x == "World War II"),
cat_ww2 = as.numeric(replace_na(cat_ww2, FALSE)),
cat_medieval = if_any(starts_with("cate"), ~ .x == "Medieval"),
cat_medieval = as.numeric(replace_na(cat_medieval, FALSE)),
cat_explor = if_any(starts_with("cate"), ~ .x == "Exploration"),
cat_explor = as.numeric(replace_na(cat_explor, FALSE)),
cat_deduct = if_any(starts_with("cate"), ~ .x == "Deduction"),
cat_deduct = as.numeric(replace_na(cat_deduct, FALSE)),
cat_party = if_any(starts_with("cate"), ~ .x == "Party Game"),
cat_party = as.numeric(replace_na(cat_party, FALSE)),
cat_abstract = if_any(starts_with("cate"), ~ .x == "Abstract Strategy"),
cat_abstract = as.numeric(replace_na(cat_abstract, FALSE)),
cat_animals = if_any(starts_with("cate"), ~ .x == "Animals"),
cat_animals = as.numeric(replace_na(cat_animals, FALSE))
) %>%
# flags for mechanic
step_mutate(
mech_dice = as.numeric(str_detect(mechanic, "Dice Rolling")),
mech_hand = as.numeric(str_detect(mechanic, "Hand Management" )),
mech_powers = as.numeric(str_detect(mechanic, "Variable Player Powers")),
mech_sets = as.numeric(str_detect(mechanic, "Set Collection")),
mech_infl = as.numeric(str_detect(mechanic,
"Area Control / Area Influence")),
mech_draft = as.numeric(str_detect(mechanic, "Card Drafting")),
mech_modular = as.numeric(str_detect(mechanic, "Modular Board")),
mech_tile = as.numeric(str_detect(mechanic, "Tile Placement")),
mech_hex = as.numeric(str_detect(mechanic, "Hex-and-Counter")),
mech_action = as.numeric(str_detect(mechanic,
"Action Point Allowance System")),
mech_coop = as.numeric(str_detect(mechanic, "Co-operative Play")),
mech_sas = as.numeric(str_detect(mechanic,
"Simultaneous Action Selection")),
mech_auction = as.numeric(str_detect(mechanic, "Auction/Bidding")),
mech_area = as.numeric(str_detect(mechanic, "Area Movement")),
mech_worker = as.numeric(str_detect(mechanic, "Worker Placement")),
mech_grid = as.numeric(str_detect(mechanic, "Grid Movement")),
mech_simul = as.numeric(str_detect(mechanic, "Simulation")),
mech_deck = as.numeric(str_detect(mechanic, "Deck / Pool Building")),
mech_partner = as.numeric(str_detect(mechanic, "Partnerships")),
mech_point = as.numeric(str_detect(mechanic, "Point to Point Movement")),
mech_route = as.numeric(str_detect(mechanic, "Route/Network Building"))
) %>%
# flags for mechanic
step_mutate(
design_1 = as.numeric(str_detect(designer, "Reiner Knizia")),
design_2 = as.numeric(str_detect(designer, "Martin Wallace")),
design_3 = as.numeric(str_detect(designer, "Wolfgang Kramer")),
design_4 = as.numeric(str_detect(designer, "Dean Essig")),
design_5 = as.numeric(str_detect(designer, "Alan R. Moon")),
design_6 = as.numeric(str_detect(designer, "Bruno Cathala")),
design_7 = as.numeric(str_detect(designer, "Friedemann Friese")),
design_8 = as.numeric(str_detect(designer, "Mike Elliott")),
design_9 = as.numeric(str_detect(designer, "Klaus Teuber")),
design_10 = as.numeric(str_detect(designer, "Richard H. Berg"))
) %>%
# remove the name, designer, mechanic and category variables
step_rm(names, mechanic, designer, category1:category12) %>%
# log transformation
step_log(num_votes, owned) %>%
# outliers
step_mutate(
min_players = case_when(
min_players == 0 ~ as.numeric(NA),
min_players > 20 ~ 20,
TRUE ~ min_players
),
max_players = case_when(
max_players == 0 ~ as.numeric(NA),
max_players > 20 ~ 20,
TRUE ~ max_players
),
min_time = case_when(
min_time == 0 ~ as.numeric(NA),
min_time > 240 ~ 240,
TRUE ~ min_time
),
max_time = case_when(
max_time == 0 ~ as.numeric(NA),
max_time > 240 ~ 240,
TRUE ~ max_time
),
avg_time = case_when(
avg_time == 0 ~ as.numeric(NA),
avg_time > 240 ~ 240,
TRUE ~ avg_time
),
year = case_when(
year == 0 ~ as.numeric(NA),
year < 1900 ~ 1900,
TRUE ~ year
)
) %>%
# imputation
step_impute_mean(min_players, max_players, year, min_time, max_time, avg_time)| Name | baked |
| Number of rows | 2623 |
| Number of columns | 59 |
| _______________________ | |
| Column type frequency: | |
| numeric | 59 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| game_id | 0 | 1 | 88298.34 | 76614.03 | 2.00 | 10906.00 | 69601.00 | 159125.00 | 244522.00 | ▇▂▂▃▂ |
| min_players | 0 | 1 | 2.02 | 0.65 | 1.00 | 2.00 | 2.00 | 2.00 | 8.00 | ▇▁▁▁▁ |
| max_players | 0 | 1 | 4.70 | 2.51 | 1.00 | 4.00 | 4.00 | 6.00 | 20.00 | ▇▆▁▁▁ |
| avg_time | 0 | 1 | 80.37 | 65.29 | 5.00 | 30.00 | 60.00 | 120.00 | 240.00 | ▇▅▂▁▂ |
| min_time | 0 | 1 | 67.91 | 59.52 | 1.00 | 30.00 | 45.00 | 90.00 | 240.00 | ▇▅▁▁▁ |
| max_time | 0 | 1 | 80.67 | 65.14 | 5.00 | 30.00 | 60.00 | 120.00 | 240.00 | ▇▆▂▁▂ |
| year | 0 | 1 | 2006.41 | 13.77 | 1900.00 | 2003.00 | 2010.00 | 2015.00 | 2018.00 | ▁▁▁▁▇ |
| num_votes | 0 | 1 | 6.62 | 1.27 | 4.14 | 5.63 | 6.43 | 7.40 | 11.26 | ▅▇▅▂▁ |
| age | 0 | 1 | 10.46 | 3.20 | 0.00 | 8.00 | 12.00 | 12.00 | 42.00 | ▃▇▁▁▁ |
| owned | 0 | 1 | 7.24 | 1.13 | 3.89 | 6.42 | 7.09 | 7.92 | 11.62 | ▁▇▇▂▁ |
| geek_rating | 0 | 1 | 6.09 | 0.49 | 5.64 | 5.73 | 5.90 | 6.30 | 8.50 | ▇▂▁▁▁ |
| cat_card_game | 0 | 1 | 0.28 | 0.45 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▃ |
| cat_wargame | 0 | 1 | 0.18 | 0.38 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| cat_fantasy | 0 | 1 | 0.15 | 0.36 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| cat_economic | 0 | 1 | 0.11 | 0.32 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| cat_fighting | 0 | 1 | 0.11 | 0.31 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| cat_science_fiction | 0 | 1 | 0.09 | 0.28 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| cat_dice | 0 | 1 | 0.08 | 0.27 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| cat_bluffing | 0 | 1 | 0.06 | 0.25 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| cat_adventure | 0 | 1 | 0.06 | 0.24 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| cat_miniatures | 0 | 1 | 0.06 | 0.24 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| cat_ww2 | 0 | 1 | 0.06 | 0.24 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| cat_medieval | 0 | 1 | 0.06 | 0.24 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| cat_explor | 0 | 1 | 0.06 | 0.23 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| cat_deduct | 0 | 1 | 0.06 | 0.24 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| cat_party | 0 | 1 | 0.05 | 0.23 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| cat_abstract | 0 | 1 | 0.05 | 0.23 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| cat_animals | 0 | 1 | 0.06 | 0.23 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| mech_dice | 0 | 1 | 0.28 | 0.45 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▃ |
| mech_hand | 0 | 1 | 0.27 | 0.44 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▃ |
| mech_powers | 0 | 1 | 0.18 | 0.39 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| mech_sets | 0 | 1 | 0.15 | 0.35 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| mech_infl | 0 | 1 | 0.13 | 0.33 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| mech_draft | 0 | 1 | 0.12 | 0.32 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| mech_modular | 0 | 1 | 0.12 | 0.32 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| mech_tile | 0 | 1 | 0.11 | 0.31 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| mech_hex | 0 | 1 | 0.09 | 0.28 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| mech_action | 0 | 1 | 0.08 | 0.27 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| mech_coop | 0 | 1 | 0.08 | 0.26 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| mech_sas | 0 | 1 | 0.08 | 0.27 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| mech_auction | 0 | 1 | 0.08 | 0.26 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| mech_area | 0 | 1 | 0.07 | 0.26 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| mech_worker | 0 | 1 | 0.07 | 0.26 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| mech_grid | 0 | 1 | 0.06 | 0.24 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| mech_simul | 0 | 1 | 0.06 | 0.24 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| mech_deck | 0 | 1 | 0.06 | 0.23 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| mech_partner | 0 | 1 | 0.06 | 0.24 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| mech_point | 0 | 1 | 0.06 | 0.23 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| mech_route | 0 | 1 | 0.06 | 0.23 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| design_1 | 0 | 1 | 0.03 | 0.16 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| design_2 | 0 | 1 | 0.01 | 0.11 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| design_3 | 0 | 1 | 0.01 | 0.10 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| design_4 | 0 | 1 | 0.01 | 0.09 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| design_5 | 0 | 1 | 0.01 | 0.10 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| design_6 | 0 | 1 | 0.01 | 0.09 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| design_7 | 0 | 1 | 0.01 | 0.09 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| design_8 | 0 | 1 | 0.01 | 0.09 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| design_9 | 0 | 1 | 0.01 | 0.08 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| design_10 | 0 | 1 | 0.01 | 0.09 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
══ Workflow ════════════════════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: rand_forest()
── Preprocessor ────────────────────────────────────────────────────────────────────────────────
7 Recipe Steps
• step_mutate()
• step_mutate()
• step_mutate()
• step_rm()
• step_log()
• step_mutate()
• step_impute_mean()
── Model ───────────────────────────────────────────────────────────────────────────────────────
Random Forest Model Specification (regression)
Main Arguments:
mtry = tune()
trees = 1500
min_n = tune()
Computational engine: ranger
set.seed(530)
train_folds <- vfold_cv(train)
set.seed(3621)
doParallel::registerDoParallel()
ranger_tune <-
tune_grid(wkflow,
resamples = train_folds,
grid = 11
)i Creating pre-processing data to finalize unknown parameter: mtry
show_best(ranger_tune, metric = "rmse") %>%
kable(
escape = FALSE,
padding = 5
) %>%
kable_styling(
bootstrap_options = c("striped", "hover"),
fixed_thead = TRUE,
position = "left"
) %>%
row_spec(0, color = "white", background = table_colour)| mtry | min_n | .metric | .estimator | mean | n | std_err | .config |
|---|---|---|---|---|---|---|---|
| 35 | 12 | rmse | standard | 0.1775053 | 10 | 0.0030664 | Preprocessor1_Model05 |
| 26 | 2 | rmse | standard | 0.1775596 | 10 | 0.0030883 | Preprocessor1_Model08 |
| 55 | 22 | rmse | standard | 0.1779476 | 10 | 0.0033328 | Preprocessor1_Model02 |
| 31 | 13 | rmse | standard | 0.1784322 | 10 | 0.0031729 | Preprocessor1_Model06 |
| 44 | 24 | rmse | standard | 0.1792009 | 10 | 0.0032586 | Preprocessor1_Model11 |
final_wkflow <- wkflow %>%
finalize_workflow(select_best(ranger_tune, metric = "rmse"))
final_wkflow══ Workflow ════════════════════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: rand_forest()
── Preprocessor ────────────────────────────────────────────────────────────────────────────────
7 Recipe Steps
• step_mutate()
• step_mutate()
• step_mutate()
• step_rm()
• step_log()
• step_mutate()
• step_impute_mean()
── Model ───────────────────────────────────────────────────────────────────────────────────────
Random Forest Model Specification (regression)
Main Arguments:
mtry = 35
trees = 1500
min_n = 12
Computational engine: ranger
collect_metrics(ranger_fit) %>%
kable(
escape = FALSE,
padding = 5
) %>%
kable_styling(
bootstrap_options = c("striped", "hover"),
fixed_thead = TRUE,
position = "left"
) %>%
row_spec(0, color = "white", background = table_colour)| .metric | .estimator | .estimate | .config |
|---|---|---|---|
| rmse | standard | 0.1802449 | Preprocessor1_Model1 |
| rsq | standard | 0.8567826 | Preprocessor1_Model1 |
ranger_preds <- collect_predictions(ranger_fit)
ranger_preds %>%
ggplot(aes(geek_rating, .pred)) +
geom_abline(lty = 2, colour = pal_sliced[3]) +
geom_point(alpha = 0.5) +
coord_fixed() +
labs(title = 'Predicted vs actual')While this was a late submission, the RMSE for my submission was 0.17972, which would put me about 10th on the private leaderboard